This notebook analyzes the sentiment of tweets about two presidential candidates, Joe Biden and Donald Trump, to see whether any interesting patterns emerge.

Import modules
import numpy as np
import pandas as pd
# Load the raw tweet exports, one CSV file per candidate hashtag.
joe_biden = pd.read_csv(
    filepath_or_buffer=r'hashtag_joebiden.csv',
)
joe_biden.head()  # notebook preview of the first rows

# The Trump export is read with an explicit '\n' line terminator.
# NOTE(review): the Biden file above uses the default terminator — confirm the
# difference between the two calls is intentional.
donard_trump = pd.read_csv(
    filepath_or_buffer='hashtag_donaldtrump.csv',
    parse_dates=True,
    lineterminator='\n',
)
donard_trump.head()  # notebook preview of the first rows
# Clean the Biden tweets: drop rows without a tweet, replace every
# non-alphanumeric character with a space, drop rows containing nulls, and
# finally drop tweets that are nothing but whitespace.
# .copy() makes an independent frame so the column assignment and in-place
# operations below do not act on a slice of the original DataFrame.
joe_biden = joe_biden[joe_biden['tweet'].notna()].copy()
# regex=True is required: from pandas 2.0 onward str.replace treats the
# pattern as a literal string by default, which would leave tweets unchanged.
joe_biden['tweet'] = joe_biden['tweet'].str.replace(r"[^A-Za-z0-9]", ' ', regex=True)
# Remove rows with a null value in any column.
joe_biden.dropna(inplace=True)
# Select whitespace-only tweets by column name instead of by tuple position;
# indexing an itertuples() row with row['Index'] raises TypeError.
blanks = joe_biden.index[joe_biden['tweet'].str.isspace()].tolist()
joe_biden.drop(blanks, inplace=True)
# Clean the Trump tweets: drop rows without a tweet, replace every
# non-alphanumeric character with a space, drop rows containing nulls, and
# finally drop tweets that are nothing but whitespace.
# .copy() makes an independent frame so the column assignment and in-place
# operations below do not act on a slice of the original DataFrame.
donard_trump = donard_trump[donard_trump['tweet'].notna()].copy()
# regex=True is required: from pandas 2.0 onward str.replace treats the
# pattern as a literal string by default, which would leave tweets unchanged.
donard_trump['tweet'] = donard_trump['tweet'].str.replace(r"[^A-Za-z0-9]", ' ', regex=True)
# Remove rows with a null value in any column.
donard_trump.dropna(inplace=True)
# Select whitespace-only tweets by column name instead of by tuple position;
# indexing an itertuples() row with row['Index'] raises TypeError.
blanks = donard_trump.index[donard_trump['tweet'].str.isspace()].tolist()
donard_trump.drop(blanks, inplace=True)
# Keep only the rows whose country column is the United States, for both
# candidates' datasets.
joe_biden = joe_biden.loc[joe_biden['country'] == 'United States of America']
donard_trump = donard_trump.loc[donard_trump['country'] == 'United States of America']
# IPython shell escape (valid only inside a notebook): installs the langdetect
# package that the commented-out language-detection code below imports.
!pip install langdetect
Language detection is a really lengthy operation, so I exported the results to CSV files and load those instead.
# Disabled language-detection step: tags each tweet with a detected language
# and keeps only the English ('en') rows. It is left commented out; the
# notebook reads CSV files from disk right after this block instead.
# NOTE(review): before re-enabling, check the following —
#   * langs_trump_merge is filtered and written to CSV two lines before the
#     lines that build it, so the last four statements look out of order;
#   * LangDetectException is referenced but never imported here;
#   * the output name 'donard_trump.csv' differs from the
#     'donard_trump_en.csv' file that is loaded afterwards.
# from langdetect import detect
# joe_biden['lang'] = joe_biden['tweet'].apply(detect)
# joe_biden[['tweet','lang']].head()
# joe_biden = joe_biden[joe_biden['lang']== 'en']
# langs = []
# for row in donard_trump.itertuples():
# try:
# lang = detect(str(row[3]))
# print((row.Index,lang))
# langs.append((row.Index,lang))
# except LangDetectException:
# langs.append((row.Index,'Error'))
# langs_trump_merge = langs_trump_merge[langs_trump_merge['lang']== 'en']
# langs_trump_merge.to_csv('donard_trump.csv')
# langs_trump_df = pd.DataFrame(langs,columns=['index','lang']).set_index(['index'])
# langs_trump_merge = donard_trump.merge(langs_trump_df, left_index=True, right_index=True)
# Replace both in-memory frames with previously exported CSV files.
# NOTE(review): presumably these are the English-only outputs of the disabled
# language-detection step (a 'lang' column is read later) — confirm.
joe_biden = pd.read_csv(
    filepath_or_buffer=r'joe_biden.csv',
)
joe_biden.head()  # notebook preview of the first rows

donard_trump = pd.read_csv(
    filepath_or_buffer='donard_trump_en.csv',
)
donard_trump.head()  # notebook preview of the first rows
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Shared VADER analyzer instance; rate_tweet and sentiment_score both call
# sid.polarity_scores on it.
sid = SentimentIntensityAnalyzer()
def rate_tweet(string):
    """Classify a tweet by the sign of its VADER compound score.

    The argument is coerced with ``str`` before scoring, so non-string
    values are scored as their string representation.

    Returns "POS" for a compound score above zero, "NEUTR" for exactly
    zero, and "NEG" otherwise.
    """
    compound = sid.polarity_scores(str(string))['compound']
    if compound == 0:
        return "NEUTR"
    return "POS" if compound > 0 else "NEG"
def sentiment_score(string):
    """Return the VADER compound score of ``string``.

    The argument is coerced with ``str`` before it is handed to the shared
    analyzer ``sid``.
    """
    polarity = sid.polarity_scores(str(string))
    return polarity['compound']
# Add a numeric compound score and a POS/NEUTR/NEG label to every Biden tweet.
biden_tweets = joe_biden['tweet']
joe_biden['sentiment_score'] = biden_tweets.apply(sentiment_score)
joe_biden['sentiment'] = biden_tweets.apply(rate_tweet)
joe_biden[['tweet', 'lang', 'sentiment', 'sentiment_score']].head()

# Same two columns for the Trump tweets.
trump_tweets = donard_trump['tweet']
donard_trump['sentiment_score'] = trump_tweets.apply(sentiment_score)
donard_trump['sentiment'] = trump_tweets.apply(rate_tweet)
donard_trump[['tweet', 'lang', 'sentiment', 'sentiment_score']].head()
# Share of each sentiment label among the Biden tweets of every US state.
# Tweet counts per (state, sentiment) pair.
sentimental_summary_by_State = joe_biden[joe_biden.country == 'United States of America'].groupby(['state', 'sentiment']).size()
# Divide every count by its state's total. transform('sum') returns a Series
# aligned to the original (state, sentiment) index, whereas
# groupby(...).apply(...) prepends the group key as an extra index level on
# pandas >= 2.0 and would duplicate the state level in the table.
state_totals = sentimental_summary_by_State.groupby(level=0).transform('sum')
sentimental_percentage_summary_by_state = sentimental_summary_by_State / state_totals
# to_frame names the column explicitly and does not depend on the Series
# being unnamed.
percentage = sentimental_percentage_summary_by_state.to_frame('percentage_of_sentiement')
pd.set_option('display.max_rows', 500)
format_percentage = {'percentage_of_sentiement': '{:.2%}'}
percentage.style.format(format_percentage)
# Share of each sentiment label among the Trump tweets of every US state.
# Tweet counts per (state, sentiment) pair.
donard_trump_sentimental_summary_by_State = donard_trump[donard_trump.country == 'United States of America'].groupby(['state', 'sentiment']).size()
# Divide every count by its state's total. transform('sum') returns a Series
# aligned to the original (state, sentiment) index, whereas
# groupby(...).apply(...) prepends the group key as an extra index level on
# pandas >= 2.0 and would duplicate the state level in the table.
donard_trump_state_totals = donard_trump_sentimental_summary_by_State.groupby(level=0).transform('sum')
donard_trump_sentimental_percentage_summary_by_state = donard_trump_sentimental_summary_by_State / donard_trump_state_totals
# to_frame names the column explicitly and does not depend on the Series
# being unnamed.
donard_trump_percentage = donard_trump_sentimental_percentage_summary_by_state.to_frame('percentage_of_sentiement')
pd.set_option('display.max_rows', 500)
format_percentage = {'percentage_of_sentiement': '{:.2%}'}
donard_trump_percentage.style.format(format_percentage)
These tables show the percentage of positive, negative, and neutral tweets for each state.
import plotly.graph_objects as go
def scale(df):
    """Return the ``sentiment_score`` column of ``df`` multiplied by 100."""
    compound_scores = df.sentiment_score
    return compound_scores * 100
# Choropleth of the mean Biden sentiment score (x 100) per US state.
us_biden = joe_biden[joe_biden.country == 'United States of America']
state_over_all = us_biden.groupby(['state_code']).apply(scale)
state_scale = state_over_all.groupby(level=0).apply(lambda x: x.mean())
# Per-tweet label column, kept because later cells may read it; the figure
# no longer uses it (see hover_text below).
joe_biden['text'] = joe_biden['state'].astype(str) + '<br>' + \
    'Sentiment Score: ' + joe_biden['sentiment_score'].astype(str) + '<br>'
state_scale = state_scale.reset_index()
# Hover text needs exactly one entry per plotted state, in the same order as
# `locations`. Passing the per-tweet joe_biden['text'] column attached
# unrelated tweets' labels to the states.
state_names = us_biden.groupby('state_code')['state'].first()
hover_text = (
    state_scale['state_code'].map(state_names).astype(str) + '<br>'
    + 'Sentiment Score: ' + state_scale['sentiment_score'].round(2).astype(str) + '<br>'
)
fig = go.Figure(data=go.Choropleth(
    locations=state_scale['state_code'],  # Spatial coordinates
    z=state_scale['sentiment_score'].astype(float),  # Data to be color-coded
    text=hover_text,  # One hover label per state
    locationmode='USA-states',  # set of locations match entries in `locations`
    colorscale='Blues',
    colorbar_title="sentiment scale",
))
fig.update_layout(
    title_text='Joe Biden Sentimental Scale',
    geo_scope='usa',  # limit map scope to USA
)
fig.show()
def scale(df):
    """Multiply the ``sentiment_score`` column of ``df`` by 100 and return it."""
    percent_factor = 100
    return df.sentiment_score * percent_factor
# Choropleth of the mean Trump sentiment score (x 100) per US state.
us_trump_tweets = donard_trump[donard_trump.country == 'United States of America']
state_over_all = us_trump_tweets.groupby(['state_code']).apply(scale)
# Average the scaled scores within each state, then turn the state_code
# index back into a regular column for plotting.
state_scale = state_over_all.groupby(level=0).apply(lambda scores: scores.mean())
state_scale = state_scale.reset_index()

trump_map = go.Choropleth(
    locationmode='USA-states',  # `locations` holds US state codes
    locations=state_scale['state_code'],
    z=state_scale['sentiment_score'].astype(float),  # value that drives the color
    colorscale='Reds',
    colorbar_title="sentiment scale",
)
fig = go.Figure(data=trump_map)
fig.update_layout(
    geo_scope='usa',  # restrict the map to the USA
    title_text='Donard Trump Sentimental Scale',
)
fig.show()
# Sum of the compound sentiment scores over all tweets, per candidate.
all_joe_biden_rate = joe_biden['sentiment_score'].sum()
all_joe_biden_rate  # notebook display of the Biden total
all_donard_trump_rate = donard_trump['sentiment_score'].sum()
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
# Bar chart comparing the average tweet sentiment of the two candidates.
plt.style.use('default')
x = ['Donard Trump']
x2 = ['Joe Biden']
# Plot the mean compound score x 100, the same quantity the state maps show.
# The earlier sum / 10000 * 100 used a hard-coded divisor unrelated to the
# number of tweets, so the bars mostly reflected how many tweets each
# candidate had and were not a percentage of anything.
biden_rating = [float(joe_biden['sentiment_score'].mean()) * 100]
Donard_rating = [float(donard_trump['sentiment_score'].mean()) * 100]
labels = x + x2
x_pos = list(range(len(labels)))  # one tick position per candidate
plt.bar(x, Donard_rating, color='red', label='Trump Rating', width=0.4)
plt.bar(x2, biden_rating, color='Blue', label='Biden Rating', width=0.4)
plt.xlabel("President Candidate")
plt.ylabel("Sentimental Score")
plt.title("Sentimental Analysis for Each President Candidate")
plt.legend(loc="upper left")
plt.xticks(x_pos, labels)
# Show the y axis as percentages to match the x 100 scaling above.
plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter())
plt.show()